This .Rmd file and the R file found in the same folder contain the same overall R code. However, this file was created to gain experience using and writing .Rmd files. It takes in a csv file of COVID-19 data, such as positive cases, tests, and deaths.
Topics used in this file include:

- Reading csv Files
- Data Frames
- Using the dplyr package
- Filtering and summarizing Data
- Creating plots using plotly and ggplot2
- Creating matrices

The csv file used in this file was retrieved from the following link: https://www.kaggle.com/datasets/lin0li/covid19testing?resource=download
First, the necessary libraries were listed.
# Libraries and packages that will be used
library(dplyr)
library(tibble)
library(ggplot2)
library(plotly)
Next, the csv file was downloaded and stored. It was read in as a data frame.
# Name of the csv file holding the COVID-19 testing data
csv_name <- "tested_worldwide.csv"
# Read the csv file into a data frame
covid_data <- read.csv(file = csv_name)
# Display the first six rows of the data
head(covid_data, n = 6L)
## Date Country_Region Province_State positive active hospitalized
## 1 2020-01-16 Iceland All States 3 NA NA
## 2 2020-01-17 Iceland All States 4 NA NA
## 3 2020-01-18 Iceland All States 7 NA NA
## 4 2020-01-20 South Korea All States 1 NA NA
## 5 2020-01-22 United States All States 0 NA NA
## 6 2020-01-22 United States Massachusetts 0 NA NA
## hospitalizedCurr recovered death total_tested daily_tested daily_positive
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA 1
## 3 NA NA NA NA NA 3
## 4 NA NA NA 4 NA NA
## 5 NA NA 0 0 NA NA
## 6 NA NA 0 0 NA NA
A summary table of the data was also created.
# Per-column summary: length/class for character columns and
# quartiles, mean and NA counts for numeric columns
summary(object = covid_data)
## Date Country_Region Province_State positive
## Length:27641 Length:27641 Length:27641 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 635
## Mode :character Mode :character Mode :character Median : 8044
## Mean : 89042
## 3rd Qu.: 52812
## Max. :9761481
## NA's :4242
## active hospitalized hospitalizedCurr recovered
## Min. : -10 Min. : 0 Min. : 0.0 Min. : 0
## 1st Qu.: 118 1st Qu.: 553 1st Qu.: 37.0 1st Qu.: 476
## Median : 2332 Median : 2592 Median : 280.0 Median : 3159
## Mean : 19030 Mean : 7495 Mean : 956.7 Mean : 25775
## 3rd Qu.: 15102 3rd Qu.: 8199 3rd Qu.: 808.0 3rd Qu.: 22167
## Max. :558636 Max. :89995 Max. :39055.0 Max. :811330
## NA's :9833 NA's :19231 NA's :13080 NA's :9626
## death total_tested daily_tested daily_positive
## Min. : 0 Min. : 0 Min. :-1243606 Min. :-15363
## 1st Qu.: 9 1st Qu.: 32191 1st Qu.: 473 1st Qu.: 7
## Median : 163 Median : 202654 Median : 3107 Median : 135
## Mean : 3074 Mean : 1485408 Mean : 19085 Mean : 1025
## 3rd Qu.: 1348 3rd Qu.: 844982 3rd Qu.: 11127 3rd Qu.: 659
## Max. :229238 Max. :136620652 Max. : 3760260 Max. :128396
## NA's :4010 NA's :912 NA's :1174 NA's :4557
A number of other functions were then used, both to gain exposure to how they work and to get the key dimensions and features of the data frame.
# Show the names of the data frame's columns
names(covid_data)
## [1] "Date" "Country_Region" "Province_State" "positive"
## [5] "active" "hospitalized" "hospitalizedCurr" "recovered"
## [9] "death" "total_tested" "daily_tested" "daily_positive"
# Transposed preview of the data: one line per column with its type
# and first values
glimpse(x = covid_data)
## Rows: 27,641
## Columns: 12
## $ Date <chr> "2020-01-16", "2020-01-17", "2020-01-18", "2020-01-20…
## $ Country_Region <chr> "Iceland", "Iceland", "Iceland", "South Korea", "Unit…
## $ Province_State <chr> "All States", "All States", "All States", "All States…
## $ positive <int> 3, 4, 7, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, NA, NA, NA,…
## $ active <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalized <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hospitalizedCurr <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ recovered <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death <int> NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA…
## $ total_tested <dbl> NA, NA, NA, 4, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, NA, NA,…
## $ daily_tested <int> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 5, 0, 0, 0, NA, …
## $ daily_positive <int> NA, 1, 3, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA…
# Distinct values of Country_Region, in order of first appearance
unique(covid_data[["Country_Region"]])
## [1] "Iceland" "South Korea"
## [3] "United States" "Australia"
## [5] "United Kingdom" "Israel"
## [7] "Czechia" "Canada"
## [9] "Russia" "Armenia"
## [11] "Poland" "Italy"
## [13] "Estonia" "Greece"
## [15] "Lithuania" "Belgium"
## [17] "New Zealand" "Sweden"
## [19] "Latvia" "Costa Rica"
## [21] "Serbia" "Slovakia"
## [23] "Bangladesh" "Turkey"
## [25] "Kazakhstan" "Palestine"
## [27] "Brazil" "Bolivia"
## [29] "Grenada" "Spain"
## [31] "Ukraine" "Germany"
## [33] "Iran" "France"
## [35] "Ireland" "Uruguay"
## [37] "Egypt" "Singapore"
## [39] "Netherlands" "Argentina"
## [41] "Bahrain" "Chile"
## [43] "Jamaica" "Japan"
## [45] "Malaysia" "Malta"
## [47] "Panama" "Peru"
## [49] "Trinidad and Tobago" "Finland"
## [51] "Mexico" "Slovenia"
## [53] "Austria" "Colombia"
## [55] "Ecuador" "North Macedonia"
## [57] "Norway" "Portugal"
## [59] "South Africa" "Switzerland"
## [61] "United Arab Emirates" "Azerbaijan"
## [63] "Belarus" "Bosnia and Herzegovina"
## [65] "China" "Croatia"
## [67] "Hungary" "Indonesia"
## [69] "Montenegro" "Nepal"
## [71] "Pakistan" "Thailand"
## [73] "Denmark" "India"
## [75] "Kosovo" "Kyrgyzstan"
## [77] "Philippines" "Romania"
## [79] "Taiwan" "Venezuela"
## [81] "Vietnam" "Barbados"
## [83] "Scotland" "North Korea"
## [85] "Albania" "Bulgaria"
## [87] "Emilia-Romagna" "Liguria"
## [89] "Lombardy" "Marche"
## [91] "Piedmont" "Tuscany"
## [93] "Veneto" "Nigeria"
## [95] "Luxembourg" "Ghana"
## [97] "Tunisia" "Cameroon"
## [99] "Ivory Coast" "Kenya"
## [101] "Morocco" "Democratic Republic of the Congo"
## [103] "Uganda" "Burkina Faso"
## [105] "Cuba" "Czech Republic"
## [107] "Guinea" "Tanzania"
## [109] "DR Congo" "El Salvador"
## [111] "Qatar" "Malawi"
## [113] "Mozambique" "Myanmar"
## [115] "Cyprus" "Ethiopia"
## [117] "Iraq" "Paraguay"
## [119] "Rwanda" "Saudi Arabia"
## [121] "Uzbekistan" "Lebanon"
## [123] "Senegal" "Sudan"
## [125] "Northern Cyprus" "Mauritius"
## [127] "Oman" "Maldives"
## [129] "Bhutan" "Sri Lanka"
## [131] "Saint Lucia" "Afghanistan"
## [133] "Algeria" "Libya"
## [135] "Madagascar" "Faroe Islands"
## [137] "Greenland" "Fiji"
## [139] "Papua New Guinea" "Kuwait"
## [141] "Dominican Republic" "Gabon"
## [143] "Togo" "Guatemala"
## [145] "Honduras" "Jordan"
## [147] "Namibia"
# List the distinct countries in alphabetical order
covid_data$Country_Region %>%
  unique() %>%
  sort()
## [1] "Afghanistan" "Albania"
## [3] "Algeria" "Argentina"
## [5] "Armenia" "Australia"
## [7] "Austria" "Azerbaijan"
## [9] "Bahrain" "Bangladesh"
## [11] "Barbados" "Belarus"
## [13] "Belgium" "Bhutan"
## [15] "Bolivia" "Bosnia and Herzegovina"
## [17] "Brazil" "Bulgaria"
## [19] "Burkina Faso" "Cameroon"
## [21] "Canada" "Chile"
## [23] "China" "Colombia"
## [25] "Costa Rica" "Croatia"
## [27] "Cuba" "Cyprus"
## [29] "Czech Republic" "Czechia"
## [31] "Democratic Republic of the Congo" "Denmark"
## [33] "Dominican Republic" "DR Congo"
## [35] "Ecuador" "Egypt"
## [37] "El Salvador" "Emilia-Romagna"
## [39] "Estonia" "Ethiopia"
## [41] "Faroe Islands" "Fiji"
## [43] "Finland" "France"
## [45] "Gabon" "Germany"
## [47] "Ghana" "Greece"
## [49] "Greenland" "Grenada"
## [51] "Guatemala" "Guinea"
## [53] "Honduras" "Hungary"
## [55] "Iceland" "India"
## [57] "Indonesia" "Iran"
## [59] "Iraq" "Ireland"
## [61] "Israel" "Italy"
## [63] "Ivory Coast" "Jamaica"
## [65] "Japan" "Jordan"
## [67] "Kazakhstan" "Kenya"
## [69] "Kosovo" "Kuwait"
## [71] "Kyrgyzstan" "Latvia"
## [73] "Lebanon" "Libya"
## [75] "Liguria" "Lithuania"
## [77] "Lombardy" "Luxembourg"
## [79] "Madagascar" "Malawi"
## [81] "Malaysia" "Maldives"
## [83] "Malta" "Marche"
## [85] "Mauritius" "Mexico"
## [87] "Montenegro" "Morocco"
## [89] "Mozambique" "Myanmar"
## [91] "Namibia" "Nepal"
## [93] "Netherlands" "New Zealand"
## [95] "Nigeria" "North Korea"
## [97] "North Macedonia" "Northern Cyprus"
## [99] "Norway" "Oman"
## [101] "Pakistan" "Palestine"
## [103] "Panama" "Papua New Guinea"
## [105] "Paraguay" "Peru"
## [107] "Philippines" "Piedmont"
## [109] "Poland" "Portugal"
## [111] "Qatar" "Romania"
## [113] "Russia" "Rwanda"
## [115] "Saint Lucia" "Saudi Arabia"
## [117] "Scotland" "Senegal"
## [119] "Serbia" "Singapore"
## [121] "Slovakia" "Slovenia"
## [123] "South Africa" "South Korea"
## [125] "Spain" "Sri Lanka"
## [127] "Sudan" "Sweden"
## [129] "Switzerland" "Taiwan"
## [131] "Tanzania" "Thailand"
## [133] "Togo" "Trinidad and Tobago"
## [135] "Tunisia" "Turkey"
## [137] "Tuscany" "Uganda"
## [139] "Ukraine" "United Arab Emirates"
## [141] "United Kingdom" "United States"
## [143] "Uruguay" "Uzbekistan"
## [145] "Veneto" "Venezuela"
## [147] "Vietnam"
# Get dimensions of the data frame as c(rows, columns)
dim(covid_data)
## [1] 27641 12
# Number of rows (observations)
nrow(covid_data)
## [1] 27641
# Number of columns (variables)
ncol(covid_data)
## [1] 12
# Compact structure: each column's type and its first few values
str(covid_data)
## 'data.frame': 27641 obs. of 12 variables:
## $ Date : chr "2020-01-16" "2020-01-17" "2020-01-18" "2020-01-20" ...
## $ Country_Region : chr "Iceland" "Iceland" "Iceland" "South Korea" ...
## $ Province_State : chr "All States" "All States" "All States" "All States" ...
## $ positive : int 3 4 7 1 0 0 0 0 0 0 ...
## $ active : int NA NA NA NA NA NA NA NA NA NA ...
## $ hospitalized : int NA NA NA NA NA NA NA NA NA NA ...
## $ hospitalizedCurr: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovered : int NA NA NA NA NA NA NA NA NA NA ...
## $ death : int NA NA NA NA 0 0 0 0 0 0 ...
## $ total_tested : num NA NA NA 4 0 0 0 0 0 0 ...
## $ daily_tested : int NA NA NA NA NA NA NA 0 0 0 ...
## $ daily_positive : int NA 1 3 NA NA NA NA 0 0 0 ...
The data was then filtered to keep only the rows covering an entire country. Additionally, the columns were renamed, and the NA values were replaced with 0.
# Keep only the country-level rows (Province_State == "All States") and
# then drop the Province_State column, which is constant after the filter
country_covid_data <- covid_data %>%
  filter(Province_State == "All States") %>%
  select(-Province_State)
head(country_covid_data)
## Date Country_Region positive active hospitalized hospitalizedCurr
## 1 2020-01-16 Iceland 3 NA NA NA
## 2 2020-01-17 Iceland 4 NA NA NA
## 3 2020-01-18 Iceland 7 NA NA NA
## 4 2020-01-20 South Korea 1 NA NA NA
## 5 2020-01-22 United States 0 NA NA NA
## 6 2020-01-23 United States 0 NA NA NA
## recovered death total_tested daily_tested daily_positive
## 1 NA NA NA NA NA
## 2 NA NA NA NA 1
## 3 NA NA NA NA 3
## 4 NA NA 4 NA NA
## 5 NA 0 0 NA NA
## 6 NA 0 0 0 0
# Give the columns consistently capitalised names (positional: the vector
# must list one name per column, in the existing column order).
# NOTE(review): "Positve" looks like a typo for "Positive"; it is kept
# as-is here because the recorded output in this document uses that name.
names(country_covid_data) <- c(
  "Date", "Country", "Positve", "Active",
  "Hospitalized", "Hospitalized_Currently",
  "Recovered", "Death", "Total_Tested",
  "Daily_Tested", "Daily_Positive"
)
# Overwrite every NA cell in the data frame with 0
country_covid_data[is.na(country_covid_data)] <- 0
head(country_covid_data)
## Date Country Positve Active Hospitalized Hospitalized_Currently
## 1 2020-01-16 Iceland 3 0 0 0
## 2 2020-01-17 Iceland 4 0 0 0
## 3 2020-01-18 Iceland 7 0 0 0
## 4 2020-01-20 South Korea 1 0 0 0
## 5 2020-01-22 United States 0 0 0 0
## 6 2020-01-23 United States 0 0 0 0
## Recovered Death Total_Tested Daily_Tested Daily_Positive
## 1 0 0 0 0 0
## 2 0 0 0 0 1
## 3 0 0 0 0 3
## 4 0 0 4 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
The data for only Canada was retrieved from the original data frame to be used to create different types of graphs.
# Build the Canada-only data set: keep Canada's rows, drop the now-constant
# Country column, and convert Date from character to Date class so it can
# be used as a time axis when plotting
canada_data <- country_covid_data %>%
  filter(Country == "Canada") %>%
  select(-Country) %>%
  mutate(Date = as.Date(Date))
glimpse(canada_data)
## Rows: 258
## Columns: 10
## $ Date <date> 2020-01-31, 2020-02-08, 2020-02-16, 2020-02-21…
## $ Positve <dbl> 4, 7, 8, 9, 10, 11, 12, 13, 15, 24, 33, 45, 51,…
## $ Active <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Hospitalized <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Hospitalized_Currently <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Recovered <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Death <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ Total_Tested <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Daily_Tested <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Daily_Positive <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4, 5, 6, 6, 0, 15…
First, the plot() function was used to create a line graph of daily positive cases by date.
# Base-graphics line graph (type = "l") of Canada's daily positive cases
# against date, drawn with a line width of 2
plot(
  x = canada_data$Date,
  y = canada_data$Daily_Positive,
  type = "l",
  lwd = 2,
  xlab = "Date",
  ylab = "Number of Cases",
  main = "Graph of Canada Daily Positive COVID Cases"
)
Next, using the filtered Canada Data, a ggplot2 line graph was created to display active cases by day.
# Use ggplot2 to create a line graph of active cases by day.
# geom_line() joins the observations in order of the x variable (Date);
# the previous geom_point() drew a scatter plot, which did not match the
# "line graph" this step is meant to produce.
ggplot(canada_data, aes(x = Date, y = Active)) +
  geom_line(color = "blue") +
  labs(title = "Canada: Active COVID-19 Cases")
Lastly, a plotly bar graph was created to show deaths by day.
# Create bar graph of deaths by day using plotly (red bars, Date on x-axis)
canada_bar_graph <- plot_ly(canada_data, x = ~Date, y = ~Death, type = "bar",
                            marker = list(color = "red"))
# Add the title and set the gap between adjacent bars.
# The layout attribute is spelled `bargap`; the earlier spelling `bargrap`
# is not a valid layout attribute, so plotly ignored it and emitted a
# "'layout' objects don't have these attributes" warning.
canada_bar_graph <- canada_bar_graph %>%
  layout(title = "Canada COVID Deaths", bargap = 0.2)
canada_bar_graph
The data was further summarized by total cases, tests, and hospitalizations by country.
# One row per country with column totals over all of its dates:
#   Tested       = sum of Daily_Tested
#   Positive     = sum of Daily_Positive
#   Hospitalized = sum of Hospitalized_Currently
# The result is ordered from most to fewest positive cases.
country_daily_summary <- country_covid_data %>%
  group_by(Country) %>%
  summarise(
    Tested = sum(Daily_Tested),
    Positive = sum(Daily_Positive),
    Hospitalized = sum(Hospitalized_Currently)
  ) %>%
  arrange(desc(Positive))
country_daily_summary
## # A tibble: 146 × 4
## Country Tested Positive Hospitalized
## <chr> <dbl> <dbl> <dbl>
## 1 United States 136937092 9850413 0
## 2 Italy 17370389 934875 2401146
## 3 Russia 11319603 432269 0
## 4 Bangladesh 2442470 420235 0
## 5 Czechia 2557224 411220 0
## 6 Canada 9873530 259992 0
## 7 Turkey 4351655 221499 0
## 8 United Kingdom 1460486 163418 0
## 9 Costa Rica 320327 116361 56929
## 10 Armenia 438837 106424 1768081
## # ℹ 136 more rows
Afterwards, to complete the analysis, the top three countries by total positive cases, tests, hospitalizations and positive test rate were determined and stored in a matrix.
# Top three countries by positive cases. country_daily_summary is already
# sorted by descending Positive, so its first three rows are the leaders.
most_positive <- head(country_daily_summary, 3)
top_positive_countries <- most_positive[["Country"]]
top_positive_countries
## [1] "United States" "Italy" "Russia"
# Top three countries by total tests: sort by descending Tested and keep
# the first three rows
most_tested <- country_daily_summary %>%
  arrange(desc(Tested)) %>%
  head(3)
top_tested_countries <- most_tested[["Country"]]
top_tested_countries
## [1] "United States" "India" "Italy"
# Top three countries by hospitalizations: sort by descending Hospitalized
# and keep the first three rows
most_hospitalized <- country_daily_summary %>%
  arrange(desc(Hospitalized)) %>%
  head(3)
top_hospitalized_countries <- most_hospitalized[["Country"]]
top_hospitalized_countries
## [1] "Italy" "Armenia" "Singapore"
# Calculate the positive test rate: total positives divided by total tests
# NOTE(review): NA values were replaced with 0 earlier, so a country's
# Tested total may be 0, which would make Rate Inf or NaN -- confirm
# whether such countries should be excluded before ranking by Rate.
country_daily_summary <- country_daily_summary %>%
  mutate(Rate = Positive / Tested)
head(country_daily_summary)
## # A tibble: 6 × 5
## Country Tested Positive Hospitalized Rate
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 United States 136937092 9850413 0 0.0719
## 2 Italy 17370389 934875 2401146 0.0538
## 3 Russia 11319603 432269 0 0.0382
## 4 Bangladesh 2442470 420235 0 0.172
## 5 Czechia 2557224 411220 0 0.161
## 6 Canada 9873530 259992 0 0.0263
# Top three countries by positive test rate: sort by descending Rate and
# keep the first three rows
high_rate <- country_daily_summary %>%
  arrange(desc(Rate)) %>%
  head(3)
highest_rate_countries <- high_rate[["Country"]]
highest_rate_countries
## [1] "Iceland" "Costa Rica" "Scotland"
# Stack the four leader vectors row-wise into a character matrix:
# one row per category (named after its vector), one column per rank
covid_leaders <- do.call(
  rbind,
  list(
    top_positive_countries = top_positive_countries,
    top_tested_countries = top_tested_countries,
    top_hospitalized_countries = top_hospitalized_countries,
    highest_rate_countries = highest_rate_countries
  )
)
covid_leaders
## [,1] [,2] [,3]
## top_positive_countries "United States" "Italy" "Russia"
## top_tested_countries "United States" "India" "Italy"
## top_hospitalized_countries "Italy" "Armenia" "Singapore"
## highest_rate_countries "Iceland" "Costa Rica" "Scotland"